#We want to investigate 2 questions:
#1) how is region associated with the number of reported COVID-19 cases, AND
#2) how is stance on mask usage associated with the subsequent number of cases in a given area?
#Import the data-set. Source: NY Times COVID-19 Master Data-Set: College Data-Set.
college<-read.csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv')
#Explore the college data-set.
head(college)
## date state county city ipeds_id
## 1 2021-05-26 Alabama Madison Huntsville 100654
## 2 2021-05-26 Alabama Montgomery Montgomery 100724
## 3 2021-05-26 Alabama Limestone Athens 100812
## 4 2021-05-26 Alabama Lee Auburn 100858
## 5 2021-05-26 Alabama Montgomery Montgomery 100830
## 6 2021-05-26 Alabama Walker Jasper 102429
## college cases cases_2021 notes
## 1 Alabama A&M University 41 NA
## 2 Alabama State University 2 NA
## 3 Athens State University 45 10
## 4 Auburn University 2742 567
## 5 Auburn University at Montgomery 220 80
## 6 Bevill State Community College 4 NA
str(college)
## 'data.frame': 1948 obs. of 9 variables:
## $ date : chr "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Madison" "Montgomery" "Limestone" "Lee" ...
## $ city : chr "Huntsville" "Montgomery" "Athens" "Auburn" ...
## $ ipeds_id : chr "100654" "100724" "100812" "100858" ...
## $ college : chr "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
## $ cases : int 41 2 45 2742 220 4 263 137 49 76 ...
## $ cases_2021: int NA NA 10 567 80 NA 49 53 10 35 ...
## $ notes : chr "" "" "" "" ...
#Create a summary of case numbers by state.
state_summary <- college %>%
group_by(state) %>%
summarise(cases=mean(cases, na.rm = T))
#View the summary of case numbers by state.
state_summary
## # A tibble: 57 x 2
## state cases
## <chr> <dbl>
## 1 Alabama 499.
## 2 Alaska 108.
## 3 American Samoa 0
## 4 Arizona 1824.
## 5 Arkansas 356.
## 6 California 241.
## 7 Colorado 472.
## 8 Connecticut 426.
## 9 Delaware 408
## 10 Florida 252.
## # … with 47 more rows
#Create a graph displaying case numbers by state. This is pretty messy, so we will tidy it up.
ggplot(state_summary, aes(x=cases, y=state)) +
geom_point()

#Using fct_infreq, we can create an ordered plot that appears much cleaner; however, we still need to clean up the legend.
college %>%
mutate(state = fct_infreq(state)) %>%
ggplot(aes(state)) +
geom_bar()

#Create a visualization that groups states by regions.
college %>%
drop_na(cases) %>%
drop_na(state) %>%
mutate(statebyregion = fct_collapse(state,
"Northeast" = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New York", "Pennsylvania", "New Jersey", "Delaware"),
"South" = c("Arkansas", "Louisiana", "Oklahoma", "Texas", "Alabama", "Kentucky", "Mississippi", "Tennessee", "Washington, D.C.", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina", "West Virginia", "Virginia"),
"Midwest" = c("Indiana", "Illinois", "Michigan", "Ohio", "Wisconsin", "Iowa", "Nebraska", "Kansas", "North Dakota", "Minnesota", "South Dakota", "Missouri"),
"West" = "Arizona", "Colorado", "Idaho", "New Mexico", "Montana", "Utah", "Nevada", "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington")) %>%
group_by(statebyregion) %>%
summarise(meancases = mean(cases)) %>%
ggplot(aes(x=meancases,
y=fct_reorder(statebyregion, meancases), col=factor(statebyregion))) +
geom_point(aes(size = meancases))

#Create a summary of 2021 case numbers by state.
state_2021_summary <- college %>%
group_by(state) %>%
summarise(cases_2021=mean(cases_2021, na.rm = T))
#View the summary of 2021 case numbers by state. These values appear to be lower and less comprehensive than the values in the case variable. When examining information about the NY Times data set, authors outline that 'cases' include all cases between 2020-2021 that were recorded as data points; furthermore, 'cases_2021' include all cases recorded as data points after January 1st, 2021. For this reason, we will rely on the 'cases' variable.
state_2021_summary
## # A tibble: 57 x 2
## state cases_2021
## <chr> <dbl>
## 1 Alabama 164.
## 2 Alaska 44.3
## 3 American Samoa 0
## 4 Arizona 680
## 5 Arkansas 107.
## 6 California 137.
## 7 Colorado 226.
## 8 Connecticut 229.
## 9 Delaware 284.
## 10 Florida 232.
## # … with 47 more rows
#Repeat the same visualization process for cases in 2021 by region (states and territories in the U.S.).
college %>%
drop_na(cases_2021) %>%
drop_na(state) %>%
mutate(statebyregion = fct_collapse(state,
"Northeast" = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New York", "Pennsylvania", "New Jersey", "Delaware"),
"South" = c("Arkansas", "Louisiana", "Oklahoma", "Texas", "Alabama", "Kentucky", "Mississippi", "Tennessee", "Washington, D.C.", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina", "West Virginia", "Virginia"),
"Midwest" = c("Indiana", "Illinois", "Michigan", "Ohio", "Wisconsin", "Iowa", "Nebraska", "Kansas", "North Dakota", "Minnesota", "South Dakota", "Missouri"),
"West" = "Arizona", "Colorado", "Idaho", "New Mexico", "Montana", "Utah", "Nevada", "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington")) %>%
group_by(statebyregion) %>%
summarise(meancases2021 = mean(cases_2021)) %>%
ggplot(aes(x=meancases2021,
y=fct_reorder(statebyregion, meancases2021), col=factor(statebyregion))) +
geom_point(aes(size = meancases2021))

#This graph changes the order of regions most affected by COVID-19 case numbers. Here, we see the Midwest underneath the South and Northeast. West is still the region most affected, which makes sense given it reported the highest number of cases between 2020-2021. Because we believe the 'cases' variable to be more representative than the 'cases_2021' variable, we will proceed using the 'cases' variable for the remainder of our analysis; however, it's important to compare both using visualizations and note the differences.
#Create a Shiny Web App:
ui <- fluidPage(
#Create an interactive COVID-19 geography explorer.
titlePanel("COVID-19 Geography Explorer: Cities & Colleges"),
sidebarLayout(
#Prompt users to enter a city name. Set the default to Atlanta.
sidebarPanel(textInput('city', 'Enter City', 'Atlanta')),
mainPanel(tabsetPanel(tabPanel("Plot", plotOutput('plotcases')))
)
)
)
server <- function(input, output, session) {
output$plotcases <- renderPlot({
ggplot(subset(college, city == input$city)) +
#Plot city by cases of COVID-19.
#Color the points according to college and set size according to number of cases.
geom_point(aes(x = city, y = cases, color = college, size = cases))
})
}
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
#Create an interactive plot_ly visualization that allows users to investigate case numbers by city.
plot_ly(college, x = ~city, y = ~cases, type = "scatter", mode = "markers", color = ~state) %>%
add_markers()
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#Create an interactive plot_ly visualization that allows users to investigate case numbers by county.
plot_ly(college, x = ~county, y = ~cases, type = "scatter", mode = "markers", color = ~state) %>%
add_markers()
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#Create a Shiny Web App w/ Plotly:
ui <- fluidPage(
#Create an interactive COVID-19 geography explorer.
titlePanel("COVID-19 Geography Explorer: County Data"),
sidebarLayout(
#Prompt users to enter a county name. Set the default to Dekalb county
sidebarPanel(textInput('county', 'Enter County', 'Dekalb')),
mainPanel(tabsetPanel(tabPanel("Plot", plotlyOutput('plotcases')),
tabPanel("Table", tableOutput('tablecases')))
)
)
)
server <- function(input, output, session) {
output$plotcases <- renderPlotly({
plot_ly(college, x=~input$county, y=~cases, type = 'scatter', mode = 'markers')
})
#Display a table with total number of cases. This will allow for comparison between number of cases per county vs. total number of cases.
output$tablecases <- renderTable({
sum(college$cases)
})
}
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
#Import the masks dataset from the NY Times research page on Github:
masks<-read.csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv', header = TRUE)
#Explore the head of the data-set:
head(masks)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS
## 1 1001 0.053 0.074 0.134 0.295 0.444
## 2 1003 0.083 0.059 0.098 0.323 0.436
## 3 1005 0.067 0.121 0.120 0.201 0.491
## 4 1007 0.020 0.034 0.096 0.278 0.572
## 5 1009 0.053 0.114 0.180 0.194 0.459
## 6 1011 0.031 0.040 0.144 0.286 0.500
#Import the FIPS_county_state CSV file, derived from the USDA, which is the source that NY Times used to obtain the data for their "COUNTYFP" column in the 'masks' data-set. Here is the link to the USDA source: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697
FIPS_county<-read.csv('FIPS_county_state.csv', header = TRUE)
#Explore the FIPS_county dataset, which we will merge the 'masks' dataset with.
head(FIPS_county)
## COUNTYFP Name State
## 1 1001 Autauga AL
## 2 1003 Baldwin AL
## 3 1005 Barbour AL
## 4 1007 Bibb AL
## 5 1009 Blount AL
## 6 1011 Bullock AL
str(FIPS_county)
## 'data.frame': 3232 obs. of 3 variables:
## $ COUNTYFP: int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ Name : chr "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ State : chr "AL" "AL" "AL" "AL" ...
#Join the FIPS_county data-set with the masks data-set. We will specify the verb 'inner' for our join in order to retain only matching observations (when it comes to the COUNTYFP column).
masks_county <- masks %>%
full_join(FIPS_county, by = c("COUNTYFP"))
#Explore the head of the combined data-sets (i.e. 'tables'). It appears the join was successful.
head(masks_county)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS Name State
## 1 1001 0.053 0.074 0.134 0.295 0.444 Autauga AL
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin AL
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour AL
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb AL
## 5 1009 0.053 0.114 0.180 0.194 0.459 Blount AL
## 6 1011 0.031 0.040 0.144 0.286 0.500 Bullock AL
#By using sum(is.na(masks_county$Name)), we noticed that 9 rows have NA values in Name column
#Using head(subset(masks_county, is.na(masks_county$Name))) the rows were checked. The Name and State, both are missing for these rows but there is data in all the columns pertaining to Masks dataset.
#Let's remove these rows
row.contains.na <- apply(masks_county, 1, function(x){any(is.na(x))}) #sum(row.contains.na)
masks_county <- masks_county[!row.contains.na,]
#Explore the head of the combined data-sets (i.e. 'tables').
head(masks_county)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS Name State
## 1 1001 0.053 0.074 0.134 0.295 0.444 Autauga AL
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin AL
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour AL
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb AL
## 5 1009 0.053 0.114 0.180 0.194 0.459 Blount AL
## 6 1011 0.031 0.040 0.144 0.286 0.500 Bullock AL
#Note to Professor----Hridansh completed this chunk of code.
#Usage of masks based on the survey
S_N<-mean(masks_county$NEVER)
S_R<-mean(masks_county$RARELY)
S_S<-mean(masks_county$SOMETIMES)
S_F<-mean(masks_county$FREQUENTLY)
S_A<-mean(masks_county$ALWAYS)
survey<-c(S_N,S_R,S_S,S_F,S_A)
labels<-c("NEVER", "RARELY", "SOMETIMES", "FREQUENTLY", "ALWAYS")
pie_percent<-round(survey/sum(survey),3)
pie_labels<-paste0((survey))
pie(survey,labels=pie_percent, col=rainbow(length(survey)),main = "Usage of Masks")
legend("topright",c("NEVER", "RARELY", "SOMETIMES", "FREQUENTLY", "ALWAYS"),cex=0.8, fill=rainbow(length(survey)))

#Note to Professor----Hridansh completed this chunk of code.
#Rename the 'Name' column in our new data-set, masks_county, so that we can join it with the college data-set.
names(masks_county)[7] <- 'county'
#Let's examine the newly named colum to ensure we can proceed with joining masks_county to the college data-set. It looks good!
head(masks_county)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS county State
## 1 1001 0.053 0.074 0.134 0.295 0.444 Autauga AL
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin AL
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour AL
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb AL
## 5 1009 0.053 0.114 0.180 0.194 0.459 Blount AL
## 6 1011 0.031 0.040 0.144 0.286 0.500 Bullock AL
#Let's remove the state column from our masks_county data-set. This will avoid any errors when joining it to our college data-set.
masks_college_nostate <- subset(masks_county, select = -State)
#Create a new data-set that combines our mask data and college data.
masks_college<- masks_college_nostate %>%
inner_join(college, by = c('county'))
#Let's examine the join. It appears successful, as the state column does not appear to contain errors.
head(masks_college)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS county date
## 1 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour 2021-05-26
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 5 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 6 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## state city ipeds_id college cases
## 1 Georgia Milledgeville 139861 Georgia College & State University 892
## 2 Georgia Milledgeville 485111 Georgia Military College 1
## 3 West Virginia Philippi 237118 Alderson Broaddus University 170
## 4 Georgia Macon 140447 Mercer University 651
## 5 Georgia Macon 482158 Middle Georgia State University 479
## 6 Georgia Macon 141325 Wesleyan College 50
## cases_2021 notes
## 1 121
## 2 NA
## 3 111
## 4 249
## 5 208
## 6 23
#Let's continue tidying up the data-set. We want to remove the ipeds_id column as it only proves the colleges in our data-set are nationally registered. It is not needed for this analysis.
masks_college2 <- subset(masks_college, select = -ipeds_id)
#To make sure removal was successful, let's explore the head of the masks_college2 data-set. It appears correct.
head(masks_college2)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS county date
## 1 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour 2021-05-26
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 5 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 6 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## state city college cases
## 1 Georgia Milledgeville Georgia College & State University 892
## 2 Georgia Milledgeville Georgia Military College 1
## 3 West Virginia Philippi Alderson Broaddus University 170
## 4 Georgia Macon Mercer University 651
## 5 Georgia Macon Middle Georgia State University 479
## 6 Georgia Macon Wesleyan College 50
## cases_2021 notes
## 1 121
## 2 NA
## 3 111
## 4 249
## 5 208
## 6 23
#As stated earlier, we will be focusing on total case numbers rather than 2021 cases for the remainder of our analysis. Let's remove this column as well.
masks_college3 <- subset(masks_college2, select = -cases_2021)
#Explore the head of the data. Removal of the cases_2021 column appears successful.
head(masks_college3)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS county date
## 1 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour 2021-05-26
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 5 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 6 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## state city college cases notes
## 1 Georgia Milledgeville Georgia College & State University 892
## 2 Georgia Milledgeville Georgia Military College 1
## 3 West Virginia Philippi Alderson Broaddus University 170
## 4 Georgia Macon Mercer University 651
## 5 Georgia Macon Middle Georgia State University 479
## 6 Georgia Macon Wesleyan College 50
#Now, to finish up tidying the data, we will remove the notes column. Although the notes are helpful, it does not contain any information we need to perform further analysis.
masks_college_cases <- subset(masks_college3, select = -notes)
#Explore the head of the masks_college_cases data-set. From there, we will perform our analysis and create visualizations.
head(masks_college_cases)
## COUNTYFP NEVER RARELY SOMETIMES FREQUENTLY ALWAYS county date
## 1 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 2 1003 0.083 0.059 0.098 0.323 0.436 Baldwin 2021-05-26
## 3 1005 0.067 0.121 0.120 0.201 0.491 Barbour 2021-05-26
## 4 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 5 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## 6 1007 0.020 0.034 0.096 0.278 0.572 Bibb 2021-05-26
## state city college cases
## 1 Georgia Milledgeville Georgia College & State University 892
## 2 Georgia Milledgeville Georgia Military College 1
## 3 West Virginia Philippi Alderson Broaddus University 170
## 4 Georgia Macon Mercer University 651
## 5 Georgia Macon Middle Georgia State University 479
## 6 Georgia Macon Wesleyan College 50
#Let's create visualizations with our new data-set: masks_college_cases. We will first use plot_ly to eye-ball if there is a relationship or association between a state reporting 'frequent' mask usage and the number of cases reported in that state.
plot_ly(masks_college_cases, x = ~state, y = ~cases, type = "scatter", mode = "markers", color = ~FREQUENTLY) %>%
add_markers()
#Create a plot_ly visualization that allows us to investigate the association between case numbers in a state and reportedly never wearing masks. The states with low case numbers but high tendency not to wear masks appear to have lower populations, which is something useful to look into if further analysis is conducted on the topic.
plot_ly(masks_college_cases, x = ~state, y = ~cases, type = "scatter", mode = "markers", color = ~NEVER) %>%
add_markers()